import data_utils
import pandas as pd
import numpy as np
from data_utils import *
import os
# Load data
# `load_fraud_data` is provided by the wildcard import from `data_utils`; the
# object it returns is indexed like a pandas DataFrame by the cells that follow.
file_path = 'bq-results1.csv'
df = load_fraud_data(file_path)
CLUSTERED BAR CHART VISUALISATION OF TRANSACTIONS BY FRAUD CLASS
import plotly.express as px
import pandas as pd

# Split the transactions by fraud label.
fraud_list = df[df['isFraud'] == 1]
non_fraud_list = df[df['isFraud'] == 0]

# Tally each transaction type once per class, then unpack labels and counts.
fraud_counts = fraud_list['type'].value_counts()
nonfraud_counts = non_fraud_list['type'].value_counts()
fraud_x, fraud_y = fraud_counts.index.tolist(), fraud_counts.values.tolist()
nonfraud_x, nonfraud_y = nonfraud_counts.index.tolist(), nonfraud_counts.values.tolist()

# Long-format table: one row per (transaction type, class) pair.
fraud_data = pd.DataFrame(
    {'Transaction_Type': fraud_x, 'Count': fraud_y, 'Name': 'Fraud'}
)
non_fraud_data = pd.DataFrame(
    {'Transaction_Type': nonfraud_x, 'Count': nonfraud_y, 'Name': 'Non-Fraud'}
)
combined_data = pd.concat([fraud_data, non_fraud_data], ignore_index=True)

# One bar per class for every transaction type.
fig = px.bar(
    combined_data,
    x='Transaction_Type',
    y='Count',
    color='Name',
    color_discrete_map={'Fraud': 'red', 'Non-Fraud': 'grey'},
)

layout_options = dict(
    title='Clustered Bar Chart of Transaction Types',
    xaxis_title='Transaction Type',
    yaxis_title='Log Count',
    barmode='group',
    height=500,
    width=800,
)
fig.update_layout(**layout_options)

# Logarithmic y-axis so both classes' bars stay visible on one scale.
fig.update_yaxes(type="log", exponentformat='none')
fig.show()
MULTIVARIATE ANALYSIS
1. CORRELATION MATRIX
# Correlation Matrix
# Pairwise correlation between the numeric columns of `df`;
# numeric_only=True leaves the non-numeric columns out of the computation.
correlation_matrix = df.corr(numeric_only=True)
print('CORRELATION')
# Bare expression: rendered as the notebook cell's output.
correlation_matrix
CORRELATION
| | step | amount | oldbalanceOrg | newbalanceOrig | oldbalanceDest | newbalanceDest | isFraud | isFlaggedFraud |
|---|---|---|---|---|---|---|---|---|
| step | 1.000000 | 0.022373 | -0.010058 | -0.010299 | 0.027665 | 0.025888 | 0.031578 | 0.003277 |
| amount | 0.022373 | 1.000000 | -0.002762 | -0.007861 | 0.294137 | 0.459304 | 0.076688 | 0.012295 |
| oldbalanceOrg | -0.010058 | -0.002762 | 1.000000 | 0.998803 | 0.066243 | 0.042029 | 0.010154 | 0.003835 |
| newbalanceOrig | -0.010299 | -0.007861 | 0.998803 | 1.000000 | 0.067812 | 0.041837 | -0.008148 | 0.003776 |
| oldbalanceDest | 0.027665 | 0.294137 | 0.066243 | 0.067812 | 1.000000 | 0.976569 | -0.005885 | -0.000513 |
| newbalanceDest | 0.025888 | 0.459304 | 0.042029 | 0.041837 | 0.976569 | 1.000000 | 0.000535 | -0.000529 |
| isFraud | 0.031578 | 0.076688 | 0.010154 | -0.008148 | -0.005885 | 0.000535 | 1.000000 | 0.044109 |
| isFlaggedFraud | 0.003277 | 0.012295 | 0.003835 | 0.003776 | -0.000513 | -0.000529 | 0.044109 | 1.000000 |
# `go` is only imported in a later cell of this notebook; import it here so
# this cell also works when the notebook is run top to bottom in a fresh kernel.
import plotly.graph_objects as go

# Heatmap of the correlation matrix. The colour range is pinned to [-1, 1]
# so the colour scale spans the full range a correlation can take.
# (The figure used to be constructed twice with identical arguments; once is enough.)
fig3 = go.Figure(data=[go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.index,
    colorscale='Cividis',
    zmin=-1,
    zmax=1,
    hoverongaps=False,
)])

# Write the coefficient (4 decimal places) inside every cell of the heatmap.
for row in correlation_matrix.index:
    for col in correlation_matrix.columns:
        fig3.add_annotation(
            x=col,
            y=row,
            text=f"{correlation_matrix.loc[row, col]:.4f}",
            font=dict(size=12),
            showarrow=False,
            xref="x",
            yref="y",
        )

# Fixed-size layout with generous margins so the axis labels are not clipped.
fig3.update_layout(
    title='Correlation Heatmap',
    xaxis=dict(tickmode='linear'),
    yaxis=dict(tickmode='linear'),
    font=dict(
        family="Arial, monospace",
        size=14,
        color="#000000",
    ),
    margin=dict(l=100, r=100, t=100, b=100),
    paper_bgcolor="white",
    autosize=False,
    width=1100,
    height=700,
)

# Show the figure
fig3.show()
1. FILTERING OF TRANSACTION TYPES WITH BOTH FRAUD CLASSES
# Reduce `df` with the project helper `filter_fraud_data` (from data_utils).
# NOTE(review): presumably keeps only the transaction types that contain both
# fraud and non-fraud rows — confirm against the helper's implementation.
filtered_df = filter_fraud_data(df)
# Preview the first rows of the filtered data.
filtered_df.head()
| | step | type | amount | nameOrig | oldbalanceOrg | newbalanceOrig | nameDest | oldbalanceDest | newbalanceDest | isFraud | isFlaggedFraud |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 162 | CASH_OUT | 219742.61 | C493470768 | 25575.0 | 0.00 | C861627372 | 323624.20 | 543366.81 | 0 | 0 |
| 5 | 133 | CASH_OUT | 219742.55 | C860234025 | 0.0 | 0.00 | C221750228 | 1365483.09 | 1569242.65 | 0 | 0 |
| 8 | 279 | TRANSFER | 219742.27 | C1763304788 | 0.0 | 0.00 | C290822335 | 759457.06 | 979199.33 | 0 | 0 |
| 9 | 257 | CASH_OUT | 219742.26 | C1788227639 | 29035.0 | 0.00 | C1568862543 | 686283.62 | 906025.88 | 0 | 0 |
| 10 | 352 | CASH_OUT | 219742.19 | C1880833503 | 904462.0 | 684719.81 | C1915634371 | 748163.19 | 967905.38 | 0 | 0 |
# Pie chart of the 'type' column of the filtered data (project helper from data_utils).
plot_pie_chart(df=filtered_df, column_name='type', title='Distribution of Fraud Transaction Types')
# Re-encode the 'type' column with the project helper.
# NOTE(review): in the preview output both CASH_OUT and TRANSFER rows show
# type == 1, which would make 'type' a constant column — verify the mapping
# inside transform_transaction_type.
transformed_df = transform_transaction_type(df=filtered_df, column_name='type')
transformed_df.head()
| | step | type | amount | nameOrig | oldbalanceOrg | newbalanceOrig | nameDest | oldbalanceDest | newbalanceDest | isFraud | isFlaggedFraud |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 162 | 1 | 219742.61 | C493470768 | 25575.0 | 0.00 | C861627372 | 323624.20 | 543366.81 | 0 | 0 |
| 5 | 133 | 1 | 219742.55 | C860234025 | 0.0 | 0.00 | C221750228 | 1365483.09 | 1569242.65 | 0 | 0 |
| 8 | 279 | 1 | 219742.27 | C1763304788 | 0.0 | 0.00 | C290822335 | 759457.06 | 979199.33 | 0 | 0 |
| 9 | 257 | 1 | 219742.26 | C1788227639 | 29035.0 | 0.00 | C1568862543 | 686283.62 | 906025.88 | 0 | 0 |
| 10 | 352 | 1 | 219742.19 | C1880833503 | 904462.0 | 684719.81 | C1915634371 | 748163.19 | 967905.38 | 0 | 0 |
# Encode the account identifier columns 'nameOrig' and 'nameDest' with the
# project helper `data_encoding`; the preview shows them replaced by integers.
# NOTE(review): the encoding scheme is not visible here — confirm in data_utils.
transformed_df = data_encoding(df=transformed_df, column1='nameOrig', column2='nameDest')
transformed_df.head()
| | step | type | amount | nameOrig | oldbalanceOrg | newbalanceOrig | nameDest | oldbalanceDest | newbalanceDest | isFraud | isFlaggedFraud |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 162 | 1 | 219742.61 | 2041924 | 25575.0 | 0.00 | 473203 | 323624.20 | 543366.81 | 0 | 0 |
| 5 | 133 | 1 | 219742.55 | 2568121 | 0.0 | 0.00 | 304416 | 1365483.09 | 1569242.65 | 0 | 0 |
| 8 | 279 | 1 | 219742.27 | 1091742 | 0.0 | 0.00 | 322753 | 759457.06 | 979199.33 | 0 | 0 |
| 9 | 257 | 1 | 219742.26 | 1127672 | 29035.0 | 0.00 | 149889 | 686283.62 | 906025.88 | 0 | 0 |
| 10 | 352 | 1 | 219742.19 | 1260399 | 904462.0 | 684719.81 | 241197 | 748163.19 | 967905.38 | 0 | 0 |
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif

# Predictor columns (every column except the label) and the label itself.
feature_columns = [
    'step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg',
    'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest',
    'isFlaggedFraud',
]
features = transformed_df[feature_columns]
target = transformed_df['isFraud']

# Score every feature with the ANOVA F-test; k='all' keeps all of them.
best_features = SelectKBest(score_func=f_classif, k='all')
fit = best_features.fit(features, target)

# One row per feature, highest score first.
featureScores = pd.DataFrame(
    data=fit.scores_,
    index=list(features.columns),
    columns=['ANOVA Score'],
).sort_values(by='ANOVA Score', ascending=False)

# Two side-by-side panels for the score table.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(6, 5))

# Define a color palette
colors = "YlGnBu"

# Left panel: the six top-ranked rows. Right panel: the remaining rows.
panels = (
    (ax[0], featureScores.iloc[:6, :], 'ANOVA Score : Part 1'),
    (ax[1], featureScores.iloc[6:13, :], 'ANOVA Score : Part 2'),
)
for axis, scores, heading in panels:
    sns.heatmap(
        scores,
        annot=True,
        cmap=colors,
        linewidths=0.4,
        linecolor='black',
        cbar=False,
        fmt='.2f',
        ax=axis,
    )
    axis.set_title(heading)

# Adjust layout
fig.tight_layout(w_pad=2)

# Display the plot
plt.show()
C:\Users\Administrator\AppData\Roaming\Python\Python311\site-packages\sklearn\feature_selection\_univariate_selection.py:112: UserWarning: Features [1] are constant. C:\Users\Administrator\AppData\Roaming\Python\Python311\site-packages\sklearn\feature_selection\_univariate_selection.py:113: RuntimeWarning: invalid value encountered in divide
# Order the ANOVA scores from highest to lowest (pandas places NaN scores
# last by default) and display the resulting table.
featureScores = featureScores.sort_values(by='ANOVA Score', ascending=False)
featureScores
| | ANOVA Score |
|---|---|
| oldbalanceOrg | 380694.276437 |
| amount | 13901.637857 |
| newbalanceOrig | 11236.549146 |
| step | 6578.263516 |
| isFlaggedFraud | 5391.619385 |
| oldbalanceDest | 620.175363 |
| newbalanceDest | 223.308306 |
| nameOrig | 1.505560 |
| nameDest | 1.020476 |
| type | NaN |
# Report every feature whose correlation with another feature exceeds the
# threshold, using the project helper from data_utils.
threshold = 0.7
df_fraud = transformed_df
correlated_features = find_highly_correlated_features(df_fraud, threshold)

# One line per feature, listing the features it is highly correlated with.
for feature, correlations in correlated_features.items():
    print(f"{feature} is highly correlated with {correlations}")
oldbalanceOrg is highly correlated with ['newbalanceOrig'] newbalanceOrig is highly correlated with ['oldbalanceOrg'] oldbalanceDest is highly correlated with ['newbalanceDest'] newbalanceDest is highly correlated with ['oldbalanceDest']
# Score the features against the 'isFraud' target with the project helper,
# which returns a score table and a figure.
# NOTE(review): the scoring method is not visible here — confirm in data_utils.
# `fig` rebinds the name used by earlier plotting cells.
sorted_feature_scores_df, fig = feature_selection_and_plot(df=df_fraud , target_col='isFraud')
fig.show()
# Bare expression: rendered as the notebook cell's output.
sorted_feature_scores_df
| | Feature | Score |
|---|---|---|
| 0 | type | 0.254569 |
| 1 | step | 0.012048 |
| 2 | oldbalanceOrg | 0.008912 |
| 3 | newbalanceDest | 0.005851 |
| 4 | amount | 0.003426 |
| 5 | oldbalanceDest | 0.002059 |
| 6 | nameDest | 0.000797 |
| 7 | newbalanceOrig | 0.000431 |
| 8 | isFlaggedFraud | 0.000075 |
| 9 | nameOrig | 0.000000 |
1. New Transaction Feature Formation: 'bal_chg', 'orig_zero', 'amt_bal_ratio', 'chg_amt_ratio'
# Add engineered columns with the project helper; the preview shows
# 'bal_chg', 'orig_zero', 'amt_bal_ratio' and 'chg_amt_ratio' appended.
# NOTE(review): the formulas live in data_utils and are not visible here.
transformed_df = generate_transaction_features( transformed_df)
transformed_df.head()
| | step | type | amount | nameOrig | oldbalanceOrg | newbalanceOrig | nameDest | oldbalanceDest | newbalanceDest | isFraud | isFlaggedFraud | bal_chg | orig_zero | amt_bal_ratio | chg_amt_ratio |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 162 | 1 | 219742.61 | 2041924 | 25575.0 | 0.00 | 473203 | 323624.20 | 543366.81 | 0 | 0 | -25575.00 | 0 | 8.592086 | -0.116386 |
| 5 | 133 | 1 | 219742.55 | 2568121 | 0.0 | 0.00 | 304416 | 1365483.09 | 1569242.65 | 0 | 0 | 0.00 | 1 | 0.000000 | 0.000000 |
| 8 | 279 | 1 | 219742.27 | 1091742 | 0.0 | 0.00 | 322753 | 759457.06 | 979199.33 | 0 | 0 | 0.00 | 1 | 0.000000 | 0.000000 |
| 9 | 257 | 1 | 219742.26 | 1127672 | 29035.0 | 0.00 | 149889 | 686283.62 | 906025.88 | 0 | 0 | -29035.00 | 0 | 7.568185 | -0.132132 |
| 10 | 352 | 1 | 219742.19 | 1260399 | 904462.0 | 684719.81 | 241197 | 748163.19 | 967905.38 | 0 | 0 | -219742.19 | 0 | 0.242953 | -1.000000 |
# Drop six columns: newbalanceOrig, both destination balances
# (oldbalanceDest, newbalanceDest), the encoded account identifiers
# (nameOrig, nameDest) and isFlaggedFraud.
col_to_drop = ['newbalanceOrig','nameDest', 'oldbalanceDest', 'newbalanceDest','isFlaggedFraud','nameOrig']
# The return value is discarded. NOTE(review): the preview shows the columns
# gone from transformed_df, so drop_columns presumably mutates its argument
# in place — confirm in data_utils.
drop_columns(df=transformed_df, columns_to_drop=col_to_drop)
transformed_df.head()
| | step | type | amount | oldbalanceOrg | isFraud | bal_chg | orig_zero | amt_bal_ratio | chg_amt_ratio |
|---|---|---|---|---|---|---|---|---|---|
| 4 | 162 | 1 | 219742.61 | 25575.0 | 0 | -25575.00 | 0 | 8.592086 | -0.116386 |
| 5 | 133 | 1 | 219742.55 | 0.0 | 0 | 0.00 | 1 | 0.000000 | 0.000000 |
| 8 | 279 | 1 | 219742.27 | 0.0 | 0 | 0.00 | 1 | 0.000000 | 0.000000 |
| 9 | 257 | 1 | 219742.26 | 29035.0 | 0 | -29035.00 | 0 | 7.568185 | -0.132132 |
| 10 | 352 | 1 | 219742.19 | 904462.0 | 0 | -219742.19 | 0 | 0.242953 | -1.000000 |
# Keep only the rows without any missing value; dropna() returns a new
# DataFrame, so transformed_df itself is left unchanged.
df_selected = transformed_df.dropna()
df_selected.head()
| | step | type | amount | oldbalanceOrg | isFraud | bal_chg | orig_zero | amt_bal_ratio | chg_amt_ratio |
|---|---|---|---|---|---|---|---|---|---|
| 4 | 162 | 1 | 219742.61 | 25575.0 | 0 | -25575.00 | 0 | 8.592086 | -0.116386 |
| 5 | 133 | 1 | 219742.55 | 0.0 | 0 | 0.00 | 1 | 0.000000 | 0.000000 |
| 8 | 279 | 1 | 219742.27 | 0.0 | 0 | 0.00 | 1 | 0.000000 | 0.000000 |
| 9 | 257 | 1 | 219742.26 | 29035.0 | 0 | -29035.00 | 0 | 7.568185 | -0.132132 |
| 10 | 352 | 1 | 219742.19 | 904462.0 | 0 | -219742.19 | 0 | 0.242953 | -1.000000 |
2. Descriptive Statistics and Heatmaps for Fraud and Non-Fraud Samples
# Heatmaps of the descriptive statistics per class (project helper from data_utils).
plot_descriptive_stats_heatmaps(df_selected, 'isFraud')

# describe() per class, transposed so every feature is a row and every
# statistic (count, mean, std, ...) is a column.
fraud_stats = df_selected.loc[df_selected['isFraud'] == 1].describe().transpose()
nofraud_stats = df_selected.loc[df_selected['isFraud'] == 0].describe().transpose()

# Per-feature means; the label row is removed when present.
fraud_mean = fraud_stats['mean'].drop('isFraud', errors='ignore')
nofraud_mean = nofraud_stats['mean'].drop('isFraud', errors='ignore')

# Positive values: the feature's mean is higher in the fraud class.
fraud_class_observation = fraud_mean - nofraud_mean
import plotly.graph_objects as go
import pandas as pd


def _signed_log1p(values):
    """Return sign(v) * log1p(|v|) element-wise.

    Plain ``np.log1p`` is undefined for v <= -1 and returns NaN there, which
    silently removed the negative means from the plot. The signed variant is
    defined for every real value, keeps the sign, and equals ``np.log1p(v)``
    for v >= 0.
    """
    values = np.asarray(values, dtype=float)
    return np.sign(values) * np.log1p(np.abs(values))


# Side-by-side table of the per-class feature means.
df_viz = pd.DataFrame({
    'Feature': fraud_mean.index,
    'Fraud Mean': fraud_mean.values,
    'No Fraud Mean': nofraud_mean.values,
})

# Dot plot of the raw means, one marker per feature and class.
fig_dot = go.Figure()
fig_dot.add_trace(go.Scatter(
    x=df_viz['Feature'], y=df_viz['Fraud Mean'],
    mode='markers', name='Fraud', marker_color='red',
))
fig_dot.add_trace(go.Scatter(
    x=df_viz['Feature'], y=df_viz['No Fraud Mean'],
    mode='markers', name='No Fraud', marker_color='blue',
))
fig_dot.update_layout(
    title="Scatter Plot of Fraud and No Fraud Means",
    xaxis_title="Feature",
    yaxis_title="Mean Value",
)
fig_dot.show()

# Long-format table of the log-compressed means for the second plot.
# NOTE: `df_fraud` rebinds a name an earlier cell used as an alias of
# transformed_df; the name is kept so later cells see the same variables.
df_fraud = pd.DataFrame({
    'Feature': fraud_mean.index,
    'Log Mean Value': _signed_log1p(fraud_mean.values),
    'Class': 'Fraud',
})
df_nofraud = pd.DataFrame({
    'Feature': nofraud_mean.index,
    'Log Mean Value': _signed_log1p(nofraud_mean.values),
    'Class': 'No Fraud',
})
df_combined = pd.concat([df_fraud, df_nofraud])

# Scatter plot of the log-compressed means, coloured by class.
fig_scatter = px.scatter(
    df_combined,
    x='Feature',
    y='Log Mean Value',
    color='Class',
    color_discrete_map={'Fraud': 'red', 'No Fraud': 'blue'},
    title='Scatter Plot of Log-Transformed Fraud and No Fraud Means',
    opacity=0.5,
)
fig_scatter.show()
C:\Users\Administrator\AppData\Local\Temp\ipykernel_3084\2164487125.py:16: RuntimeWarning: invalid value encountered in log1p C:\Users\Administrator\AppData\Local\Temp\ipykernel_3084\2164487125.py:17: RuntimeWarning: invalid value encountered in log1p
# Print the per-class means (rounded to two decimals) followed by the
# unrounded fraud-minus-no-fraud difference.
reports = (
    ('\nFraud Class Mean', fraud_stats['mean'].round(2)),
    ('\nNo Fraud Class Mean\n', nofraud_stats['mean'].round(2)),
    ('\nMean Difference', fraud_class_observation),
)
for heading, values in reports:
    print(heading)
    print(values)
Fraud Class Mean step 368.08 type 1.00 amount 1470832.67 oldbalanceOrg 1652887.65 isFraud 1.00 bal_chg -1460119.48 orig_zero 0.00 amt_bal_ratio 1.00 chg_amt_ratio -0.99 Name: mean, dtype: float64 No Fraud Class Mean step 241.63 type 1.00 amount 314115.50 oldbalanceOrg 42879.69 isFraud 0.00 bal_chg -27311.99 orig_zero 0.47 amt_bal_ratio 145.97 chg_amt_ratio -0.19 Name: mean, dtype: float64 Mean Difference step 1.264515e+02 type 0.000000e+00 amount 1.156717e+06 oldbalanceOrg 1.610008e+06 bal_chg -1.432807e+06 orig_zero -4.706822e-01 amt_bal_ratio -1.449771e+02 chg_amt_ratio -8.078642e-01 Name: mean, dtype: float64
# Write the selected data to 'fraud_data.csv' in the working directory;
# index=False leaves the DataFrame index out of the file.
df_selected.to_csv('fraud_data.csv', index=False)